#   LibAut
#       outcome regressions: multiple, leave-pair-out CV
#   Juraj Medzihorsky
#   2021-08-12

library(mgcv)
library(parallel)
options(mc.cores=14) # written for a 16-thread linux machine
options(stringsAsFactors=F)
source('helpers.R')

#   sessionInfo()
##  R version 4.0.4 (2021-02-15)
##  Platform: x86_64-pc-linux-gnu (64-bit)
##  Running under: Ubuntu 20.04.2 LTS
##
##  Matrix products: default
##  BLAS/LAPACK: /opt/OpenBLAS/lib/libopenblas_haswellp-r0.3.13.so
##
##  locale:
##   [1] LC_CTYPE=C.UTF-8           LC_NUMERIC=C
##   [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C.UTF-8
##   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C.UTF-8
##   [7] LC_PAPER=C.UTF-8           LC_NAME=C
##   [9] LC_ADDRESS=C               LC_TELEPHONE=C
##  [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
##  attached base packages:
##  [1] parallel  stats     graphics  grDevices utils     datasets  methods
##  [8] base
##
##  other attached packages:
##  [1] mgcv_1.8-34    nlme_3.1-152   nvimcom_0.9-92
##
##  loaded via a namespace (and not attached):
##  [1] compiler_4.0.4  Matrix_1.3-2    tools_4.0.4     splines_4.0.4
##  [5] grid_4.0.4      lattice_0.20-41

#------------------------------------------------------------------------------

code_dir <- getwd()
data_dir <- gsub('code$', 'data', code_dir)

setwd(data_dir)
dat <- readRDS('outcome_multiple_20210810.rds')
setwd(code_dir)

dat$reg_fac <- as.factor(dat$e_regionpol_6C)
dat <- subset(dat, !(dem_ep_id%in%'HRV_1992_2004'))

#------------------------------------------------------------------------------
#   models

#   formulas
dummies <- c('reg_fac', 'e_miinteco', 'e_miinterc')
#
conties <- 
    c('e_migdppcln', 'e_migdpgro', 'logpop', 'v2pepwrsoc', 'v2xeg_eqdr',
      'v2xnp_pres', 'v2csprtcpt', 'v2x_polyarchy', 'excl_region_edi', 'year')  
#
dummies <- c(dummies[1], paste0('lag1_', dummies[2:3]))
conties <- c(paste0('lag1_', conties[1:9]), conties[10])
#
fy0 <- 'epy ~ 1'
fd <- sapply(paste0(fy0, ' + ', dummies), as.formula)
fc <- sapply(paste0(fy0, ' + ', 's(', conties, ', bs="gp")'), as.formula)
fc[[9]] <- as.formula(paste0(fy0, ' + ', 's(', conties[9], ', bs="re")'))


#   get fitted (predicted) values
getfitted <- 
    function(x) 
    {
        d <- x$model
        f <- predict(x, newdata=d, se.fit=T, type='response')
        d$fit <- f$fit
        d$se.fit <- f$se.fit
        return(d)
    }


#   binomial-logistic GLM under MLE for discrete X j and left-out-pair i
getoned <-
    function(i, j, dd=pd)
    {
        ii <- unlist(gp_d[[j]][i,])
        odat <- dd[[j]][-ii,]
        ndat <- dd[[j]][ ii,]
        m <- glm(fd[[j]], data=odat, family=binomial('logit'))
        p <- predict(m, newdata=ndat, type='response')
        auc <- as.numeric(p[1]<p[2])
        cc <- mean(round(p)==ndat$epy)
        return(data.frame(auc=auc, cc=cc))
    }


#   binomial-logistic GAM under MLE for continuous X j and left-out-pair i
getonec <-
    function(i, j, dd=pc)
    {
        ii <- unlist(gp_c[[j]][i,])
        odat <- dd[[j]][-ii,]
        ndat <- dd[[j]][ ii,]
        m <- gam(fc[[j]], data=odat, family=binomial('logit'))
        p <- predict(m, newdata=ndat, type='response')
        auc <- as.numeric(p[1]<p[2])
        cc <- mean(round(p)==ndat$epy)
        return(data.frame(auc=auc, cc=cc))
    }


#------------------------------------------------------------------------------
#   data prep

#   fit to the full data once
ld <- mclapply(fd, glm, data=dat, family=binomial('logit'))
lc <- mclapply(fc, gam, data=dat, family=binomial('logit'))
#   get the fitted values
pd <- mclapply(ld, getfitted) 
pc <- mclapply(lc, getfitted) 
#   find all possible 0-1 pairs within each subset
gp_d <- lapply(pd, function(x) expand.grid(y0=which(x$epy%in%0), y1=which(x$epy%in%1)))
gp_c <- lapply(pc, function(x) expand.grid(y0=which(x$epy%in%0), y1=which(x$epy%in%1)))
#   how many possible 0-1 pairs within each subset
n_d <- sapply(gp_d, nrow)
n_c <- sapply(gp_c, nrow)
#   prepare list for the output
out_d <- vector(length=length(fd), mode='list')
out_c <- vector(length=length(fc), mode='list')
names(out_d) <- dummies
names(out_c) <- conties

#------------------------------------------------------------------------------
#   execute: parallelized within loops

for (k in 1:length(out_d)) {
    rr <- n_d[k]
    out_d[[k]] <- do.call(rbind, mclapply(1:rr, function(ii) getoned(ii, j=k)))
}

for (k in 1:length(out_c)) {
    rr <- n_c[k]
    out_c[[k]] <- do.call(rbind, mclapply(1:rr, function(ii) getonec(ii, j=k)))
}

#------------------------------------------------------------------------------
#   save

setwd(data_dir)
save(list=c('out_d', 'out_c'),
     file='outcome_regressions_simple_lpo_20210812.rdata')

#   SCRIPT END